In [ ]:
from __future__ import division, print_function
## __future__ imports must come first in the cell, ahead of any other statement.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyjags
import pickle
## pandas.tools.plotting was renamed to pandas.plotting in pandas >= 0.20;
## adjust this import to match your pandas version.
from pandas.tools.plotting import *
from matplotlib import rcParams
rcParams["savefig.dpi"] = 100
rcParams["font.size"] = 20
#plt.style.use('ggplot')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
#%qtconsole
In [ ]:
## Function to draw from a truncated normal; this function will be used for both the
## one- and two-component cases in this workbook.
## Note: np.random.normal takes the standard deviation (not the precision), so the
## scale argument is named sigma here.
def rnorm_bound(Ndata, mu, sigma, lower_bound=0.0, upper_bound=float('Inf')):
    ## Simple rejection sampler: redraw each value until it lands inside the bounds.
    x = np.zeros(Ndata)
    for i in range(0, Ndata):
        while True:
            x[i] = np.random.normal(mu, sigma)
            if (x[i] > lower_bound) and (x[i] < upper_bound):
                break
    return x
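As an aside, the rejection loop above can be cross-checked against a vectorized sampler. Below is a minimal sketch assuming SciPy is available; scipy.stats.truncnorm takes bounds standardized by (bound - mu) / sigma. This cell is purely an optional consistency check and is not used elsewhere in the notebook.
In [ ]:
## Optional cross-check of rnorm_bound (assumes SciPy is installed):
## scipy.stats.truncnorm expects *standardized* truncation bounds.
from scipy.stats import truncnorm

def rnorm_bound_vec(Ndata, mu, sigma, lower_bound=0.0, upper_bound=float('Inf')):
    a = (lower_bound - mu) / sigma  # standardized lower bound
    b = (upper_bound - mu) / sigma  # standardized upper bound
    return truncnorm.rvs(a, b, loc=mu, scale=sigma, size=Ndata)

## Should be distributed like rnorm_bound(10, 0.0, 0.001, lower_bound=-1, upper_bound=1):
print(rnorm_bound_vec(10, 0.0, 0.001, lower_bound=-1, upper_bound=1))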
In [ ]:
## Check the output to make sure the function behaves as expected.
#print(np.random.normal(0, 0.0001, 10))
print(rnorm_bound(10, 0.0, 0.001, lower_bound=-1, upper_bound=1))
In [ ]:
## One-component Gaussian mixture: simulated data for both projected eccentricity terms.
## Below we designate the population values of our generative model. These are the
## truths that we should recover if our hierarchical Bayesian model is properly specified
## and diagnostics have indicated that the simulation has "not not converged". ("You can't
## prove convergence; at best you can fail to prove a failure to converge.")
## In this simulated data set, there are 25 planetary systems (with one planet each).
Ndata = 25
## Here we assign the dispersion of the simulated population to be 0.3; this is
## the truth we wish to recover.
sigmae = 0.3
## We approximate the uncertainty for each measurement as normally distributed about a
## reported measurement point estimate.
## For the eccentricity distribution for Hot Jupiters, the physical models used to derive
## these produce larger uncertainty in k by a factor of 2.
sigmahobs = 0.04
sigmakobs = 0.08
h = np.repeat(0.,Ndata)
k = np.repeat(0.,Ndata)
hhat = np.repeat(0.,Ndata)
khat = np.repeat(0.,Ndata)
hhat_sigma = np.repeat(sigmahobs,Ndata)
khat_sigma = np.repeat(sigmakobs,Ndata)
#print(khat_sigma)
for i in range(0, Ndata):
    h[i] = rnorm_bound(1, 0, sigmae, lower_bound=-1, upper_bound=1)
    ## Constrain k so that the eccentricity e = sqrt(h^2 + k^2) stays below 1.
    lb = -np.sqrt(1 - h[i]**2)
    ub = np.sqrt(1 - h[i]**2)
    k[i] = rnorm_bound(1, 0, sigmae, lower_bound=lb, upper_bound=ub)
    hhat[i] = rnorm_bound(1, h[i], sigmahobs, lower_bound=-1, upper_bound=1)
    khat[i] = rnorm_bound(1, k[i], sigmakobs, lower_bound=lb, upper_bound=ub)
## Visualize the true data values and the simulated measurements:
print(h, hhat, k, khat)
plt.hist(h)
plt.hist(hhat)
In [ ]:
## JAGS user manual:
## http://www.uvm.edu/~bbeckage/Teaching/DataAnalysis/Manuals/manual.jags.pdf
## JAGS model code
## The model code below is the one-component truncated Gaussian mixture HBM from the
## previous notebook.
code1 = '''
model {
    # Population parameters
    e_sigma ~ dunif(0.0, 1.0)
    e_phi <- 1/(e_sigma*e_sigma)
    for (n in 1:Ndata){
        # True planet properties
        h[n] ~ dnorm(0, e_phi) T(-1,1) # Can try multivariate truncated normal in future
        k[n] ~ dnorm(0, e_phi) T(-sqrt(1-h[n]*h[n]),sqrt(1-h[n]*h[n]))
        # Observed planet properties
        hhat[n] ~ dnorm(h[n], 1.0/(hhat_sigma[n]*hhat_sigma[n])) T(-1,1)
        khat[n] ~ dnorm(k[n], 1.0/(khat_sigma[n]*khat_sigma[n])) T(-sqrt(1-hhat[n]*hhat[n]),sqrt(1-hhat[n]*hhat[n]))
    }
}
'''
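A note on parameterization: JAGS dnorm takes a precision phi = 1/sigma^2 rather than a standard deviation, which is why the model inverts e_sigma before using it. A quick sanity check of the conversion, with an illustrative value:
In [ ]:
## JAGS dnorm(mu, phi) uses precision phi = 1/sigma^2, unlike np.random.normal,
## which takes the standard deviation. Illustrative check:
sigma_example = 0.3
phi_example = 1.0 / sigma_example**2
print(phi_example)                  # ~11.1
print(1.0 / np.sqrt(phi_example))   # recovers sigma_example = 0.3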
In [ ]:
## Load additional JAGS module
pyjags.load_module('glm')
pyjags.load_module('dic')
## See blog post for origination of the adapted analysis tools used here and below:
## https://martynplummer.wordpress.com/2016/01/11/pyjags/
num_chains = 4
iterations = 10000
## The data list should include only variables that appear in the model.
model = pyjags.Model(code1, data=dict( Ndata=Ndata, hhat=hhat, khat=khat,
hhat_sigma=hhat_sigma, khat_sigma=khat_sigma),
chains=num_chains, adapt=1000)
## Code to speed up compute time. This feature might not be
## well tested in pyjags at this time.
## threads=4, chains_per_thread=1
## 500 warmup / burn-in iterations, not used for inference.
model.sample(500, vars=[])
## Run the model for the desired number of steps, monitoring the hyperparameter and the
## latent variables of the hierarchical Bayesian model.
## Returns a dictionary with numpy array for each monitored variable.
## Shapes of returned arrays are (... shape of variable ..., iterations, chains).
## samples = model.sample(#iterations per chain here, vars=['e_sigma', 'h', 'k'])
samples = model.sample(iterations, vars=['e_sigma', 'h', 'k'])
## Code to save, open, and use a pickled dictionary of samples:
## -- Pickle the data --
#with open('ecc_1_test.pkl', 'wb') as handle:
#    pickle.dump(samples, handle)
## -- Retrieve pickled data --
#with open('ecc_1_test.pkl', 'rb') as handle:
#    retrieved_results = pickle.load(handle)
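Before using these samples for inference, it is worth checking that we "fail to prove a failure to converge". Below is a minimal sketch of the split-chain Gelman-Rubin potential scale reduction factor, written directly against the arrays returned by pyjags; it is illustrative only and not a substitute for the full coda/JAGS diagnostic suite.
In [ ]:
## Minimal split-chain Gelman-Rubin R-hat sketch. Values near 1 are consistent
## with convergence; this is illustrative, not a full diagnostic suite.
def split_rhat(x):
    ## x: array of shape (iterations, chains)
    n = x.shape[0] // 2
    halves = np.concatenate([x[:n, :], x[n:2*n, :]], axis=1)  # (n, 2*chains)
    chain_means = halves.mean(axis=0)
    W = halves.var(axis=0, ddof=1).mean()   # mean within-chain variance
    B = n * chain_means.var(ddof=1)         # between-chain variance
    var_hat = (n - 1.0)/n * W + B/n         # pooled posterior variance estimate
    return np.sqrt(var_hat / W)

print(split_rhat(samples['e_sigma'].squeeze(0)))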
In [ ]:
## Generalize code for both h and k below.
#print(samples)
#print(samples.items())
## Print and check the shape of the resultant samples dictionary:
print(samples['e_sigma'].shape)
print(samples['e_sigma'].squeeze(0).shape)
print(samples['h'].shape)
print(samples['k'][0,:,:].shape)
print('-----')
## Update the samples dictionary so that it includes keys for the latent variables.
## Also, we will use LaTeX formatting to help make legible plots ahead.
samples_Nm1 = {}
## Adjust the thin variable to only look at every 10th population element by setting it to 10.
thin = 1
## Need to enter the number of hyperparameter variables here:
numHyperParams = 1
## Specify the dimension we want for our plot below, for legibility.
dim = (Ndata/thin)*2 + numHyperParams
print(dim)
for i in np.arange(0, Ndata, thin):
    samples_Nm1.update({'$h_{'+str(i+1)+'}$': samples['h'][i,:,:],
                        '$k_{'+str(i+1)+'}$': samples['k'][i,:,:]})
## Add the hyperparameter marginal posterior back in (raw string so that
## matplotlib sees the literal \sigma):
samples_Nm1.update({r'$e_{\sigma}$': samples['e_sigma'].squeeze(0)})
## Below, examine the updated and reformatted samples dictionary, which now includes
## keys for the latent variables:
for j, i in samples_Nm1.items():
    print(j)
    print(i)
samples_Nm1['$h_{5}$'][0]
Below is code to look at summary statistics of the marginal posterior distributions (the probabilistic parameter estimates) for the hyperparameter and for the latent variables (one per population constituent), in this case the h and k values (a.k.a. the projected eccentricity here) of the exoplanet systems we are simulating.
In [ ]:
## Equal-tailed 95% credible intervals and posterior distribution means:
def summary(samples, varname, p=95):
    values = samples[varname]
    ## An equal-tailed p% interval spans the (50 - p/2) to (50 + p/2) percentiles.
    ci = np.percentile(values, [50 - p/2., 50 + p/2.])
    print('{:<6} mean = {:>5.1f}, {}% credible interval [{:>4.1f} {:>4.1f}]'.format(
        varname, np.mean(values), p, *ci))

for varname in samples_Nm1:
    summary(samples_Nm1, varname)
In [ ]:
## Use a pandas three-dimensional Panel to represent the trace
## (pd.Panel was removed in pandas 0.25, so this requires an older pandas):
trace = pd.Panel(samples_Nm1)
trace.axes[0].name = 'Variable'
trace.axes[1].name = 'Iteration'
trace.axes[2].name = 'Chain'
## Point estimates:
print(trace.to_frame().mean())
## Bayesian equal-tailed 95% credible intervals:
print(trace.to_frame().quantile([0.025, 0.975]))
## ^ entering the values here could be a good question part
def plot(trace, var):
    fig, axes = plt.subplots(1, 3, figsize=(9, 3))
    fig.suptitle(var, y=0.95, fontsize='xx-large')
    ## Marginal posterior density estimate:
    trace[var].plot.density(ax=axes[0])
    axes[0].set_xlabel('Parameter value')
    axes[0].locator_params(tight=True)
    ## Autocorrelation for each chain:
    axes[1].set_xlim(0, 100)
    for chain in trace[var].columns:
        autocorrelation_plot(trace[var, :, chain], axes[1], label=chain)
    ## Trace plot:
    axes[2].set_ylabel('Parameter value')
    trace[var].plot(ax=axes[2])
    ## Save figure, stripping LaTeX markup from the variable name first:
    filename = var.replace("\\", "")
    filename = filename.replace("$", "")
    filename = filename.replace("}", "")
    filename = filename.replace("{", "")
    plt.tight_layout(pad=3)
    fig.savefig('Nm1_JAGS_h_and_k_'+'{}.png'.format(filename))

## Display diagnostic plots:
for var in trace:
    plot(trace, var)
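The autocorrelation panels above can be condensed into a rough effective sample size. The sketch below uses the crude estimator N / (1 + 2*sum(rho_t)), truncating the autocorrelation sum at the first non-positive lag; it is cruder than the initial-sequence estimators in coda and is included only to put a number on what the plots show.
In [ ]:
## Rough effective-sample-size sketch for a single chain; illustrative only.
def ess(x, max_lag=200):
    x = np.asarray(x, dtype=float).ravel()
    x = x - x.mean()
    n = len(x)
    ## Sample autocorrelation at lags 0..n-1:
    acf = np.correlate(x, x, mode='full')[n-1:] / (np.arange(n, 0, -1) * x.var())
    tau = 1.0
    for t in range(1, min(max_lag, n)):
        if acf[t] <= 0:
            break
        tau += 2.0 * acf[t]
    return n / tau

## e.g. ESS of the hyperparameter in chain 0:
print(ess(samples_Nm1[r'$e_{\sigma}$'][:, 0]))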
In [ ]:
## Scatter matrix plot:
## Redefine the trace so that we only visualize every 10th latent-variable element in
## the scatter_matrix plot below; visualizing all 50 is too cumbersome for the scatter
## matrix.
samples_Nm1_for_scatter_matrix = {}
Nsamp = 25
## Adjust the thin variable to only look at every 10th population element by setting it to 10.
thin = 10
numHyperParams = 1
dim = (Nsamp/thin)*2 + numHyperParams
print(dim)
for i in np.arange(0, Nsamp, thin):
    samples_Nm1_for_scatter_matrix.update({'$h_{'+str(i+1)+'}$': samples['h'][i,:,:],
                                           '$k_{'+str(i+1)+'}$': samples['k'][i,:,:]})
samples_Nm1_for_scatter_matrix.update({r'$e_{\sigma}$': samples['e_sigma'].squeeze(0)})
for j, i in samples_Nm1_for_scatter_matrix.items():
    print(j)
trace_2 = pd.Panel(samples_Nm1_for_scatter_matrix)
sm = scatter_matrix(trace_2.to_frame(), color="darkturquoise", alpha=0.2,
                    figsize=(dim*2, dim*2), diagonal='hist',
                    hist_kwds={'bins': 25, 'histtype': 'step', 'edgecolor': 'r', 'linewidth': 2})
## y labels size
[plt.setp(item.yaxis.get_label(), 'size', 20) for item in sm.ravel()]
## x labels size
[plt.setp(item.xaxis.get_label(), 'size', 20) for item in sm.ravel()]
## Change label rotation
## This is helpful for very long labels
#[s.xaxis.label.set_rotation(45) for s in sm.reshape(-1)]
[s.xaxis.label.set_rotation(0) for s in sm.reshape(-1)]
[s.yaxis.label.set_rotation(0) for s in sm.reshape(-1)]
## May need to offset label when rotating to prevent overlap of figure
[s.get_yaxis().set_label_coords(-0.5,0.5) for s in sm.reshape(-1)]
## Hide all ticks
#[s.set_xticks(()) for s in sm.reshape(-1)]
#[s.set_yticks(()) for s in sm.reshape(-1)]
plt.savefig('scatter_matrix_Nm1_JAGS.png')
In [ ]:
## Below we designate the population values of our two-component truncated Gaussian
## generative model. These are the truths that we should recover if our hierarchical
## Bayesian model is properly specified and diagnostics have indicated that the simulation
## has "not not converged".
Ndata = 50
Nm = 2
frac = [0.7, 0.3]
sigmae = [0.05, 0.3]
## After generating values from the population model, we now add realistic Gaussian noise
## to create simulated measurements.
sigmahobs = 0.04
sigmakobs = 0.08
h = np.repeat(0.,Ndata)
k = np.repeat(0.,Ndata)
hhat = np.repeat(0.,Ndata)
khat = np.repeat(0.,Ndata)
hhat_sigma = np.repeat(sigmahobs,Ndata)
khat_sigma = np.repeat(sigmakobs,Ndata)
#print(khat_sigma)
for i in range(0, Ndata):
    ## Draw the mixture-component label for this system:
    c = np.random.choice(len(frac), 1, p=frac, replace=True)
    h[i] = rnorm_bound(1, 0, sigmae[int(c)], lower_bound=-1, upper_bound=1)
    ## Constrain k so that the eccentricity e = sqrt(h^2 + k^2) stays below 1.
    lb = -np.sqrt(1 - h[i]**2)
    ub = np.sqrt(1 - h[i]**2)
    k[i] = rnorm_bound(1, 0, sigmae[int(c)], lower_bound=lb, upper_bound=ub)
    hhat[i] = rnorm_bound(1, h[i], sigmahobs, lower_bound=-1, upper_bound=1)
    khat[i] = rnorm_bound(1, k[i], sigmakobs, lower_bound=lb, upper_bound=ub)
print(h, hhat, k, khat)
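As with the one-component case, a quick look at the simulated population is a useful sanity check; the narrow (sigma = 0.05) component should dominate with weight 0.7, with a broad (sigma = 0.3) tail from the second component.
In [ ]:
## Visualize the two-component simulated population and measurements:
plt.hist(h, bins=25, histtype='step', label='true h')
plt.hist(hhat, bins=25, histtype='step', label='observed hhat')
plt.xlabel('h')
plt.legend()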
In [ ]:
## JAGS model code for the two-component mixture:
code = '''
model {
    # Population parameters
    for (j in 1:Nm) {
        e_sigma[j] ~ dunif(0.0, 1.0)
        e_phi[j] <- 1/(e_sigma[j]*e_sigma[j])
        a[j] <- 1
    }
    # Flat Dirichlet prior on the mixture fractions
    f ~ ddirch(a[])
    for (n in 1:Ndata){
        # True planet properties
        c[n] ~ dcat(f[])
        h[n] ~ dnorm(0, e_phi[c[n]]) T(-1,1) # Can try multivariate truncated normal in future
        k[n] ~ dnorm(0, e_phi[c[n]]) T(-sqrt(1-h[n]*h[n]),sqrt(1-h[n]*h[n]))
        # Observed planet properties
        hhat[n] ~ dnorm(h[n], 1.0/(hhat_sigma[n]*hhat_sigma[n])) T(-1,1)
        khat[n] ~ dnorm(k[n], 1.0/(khat_sigma[n]*khat_sigma[n])) T(-sqrt(1-hhat[n]*hhat[n]),sqrt(1-hhat[n]*hhat[n]))
    }
}
'''
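The ddirch/dcat pair above is the standard finite-mixture construction: the mixture weights f are drawn from a flat Dirichlet (all concentration parameters a[j] = 1), and each system's component label c[n] is drawn categorically with those weights. A small NumPy sketch of the same two distributions:
In [ ]:
## NumPy sketch of the ddirch/dcat pair used in the JAGS mixture above:
## f ~ Dirichlet(1,...,1) is flat on the simplex; c ~ Categorical(f) assigns
## each system to a component.
f_draw = np.random.dirichlet(np.ones(Nm))
c_draw = np.random.choice(Nm, size=Ndata, p=f_draw)
print(f_draw)                                             # weights, sum to 1
print(np.bincount(c_draw, minlength=Nm) / float(Ndata))   # empirical fractions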
In [ ]:
# Load additional JAGS module
pyjags.load_module('glm')
pyjags.load_module('dic')
## The data list includes only variables used in the model.
model = pyjags.Model(code, data=dict(Nm=Nm, Ndata=Ndata, hhat=hhat, khat=khat,
                                     hhat_sigma=hhat_sigma, khat_sigma=khat_sigma),
                     chains=4, adapt=1000)
# threads=4, chains_per_thread=1
# 500 warmup / burn-in iterations, not used for inference.
model.sample(500, vars=[])
## Run the model for the desired number of steps, monitoring the hyperparameter and
## latent variables. Returns a dictionary with a numpy array for each monitored variable.
## Shapes of returned arrays are (... shape of variable ..., iterations, chains).
## Note: monitoring h, k, c, and f for this many iterations across 4 chains produces
## very large arrays; reduce iters if memory is a concern.
iters = 1000000
samples2 = model.sample(iters, vars=['e_sigma', 'h', 'k', 'c', 'f'])
## Code to save, open, and use a pickled dictionary of samples:
## -- Pickle the data --
#with open('ecc_2_test.pkl', 'wb') as handle:
#    pickle.dump(samples2, handle)
## -- Retrieve pickled data --
#with open('ecc_2_test.pkl', 'rb') as handle:
#    retrieved_results = pickle.load(handle)
Sort the high and low mixture components to accommodate the exchangeability of the mixture parameters (i.e., to undo label switching).
In [ ]:
chain_thin = 100
## Keep only the last 1000 iterations of each chain for the analysis below:
start = int(iters - 1000)
## For each retained draw, assign the smaller e_sigma to the "low" component and
## carry the corresponding mixture fraction f along with it:
esigma_low = np.where(samples2['e_sigma'][0,start:,:] <= samples2['e_sigma'][1,start:,:],
                      samples2['e_sigma'][0,start:,:], samples2['e_sigma'][1,start:,:])
esigma_hi = np.where(samples2['e_sigma'][0,start:,:] > samples2['e_sigma'][1,start:,:],
                     samples2['e_sigma'][0,start:,:], samples2['e_sigma'][1,start:,:])
f_low = np.where(samples2['e_sigma'][0,start:,:] <= samples2['e_sigma'][1,start:,:],
                 samples2['f'][0,start:,:], samples2['f'][1,start:,:])
f_hi = np.where(samples2['e_sigma'][0,start:,:] > samples2['e_sigma'][1,start:,:],
                samples2['f'][0,start:,:], samples2['f'][1,start:,:])
print(np.min(f_hi))
plt.hist(f_low)
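The np.where bookkeeping above is specific to Nm = 2. For a general number of components, one can argsort e_sigma at every retained draw and apply the same ordering to f; a sketch is below, assuming NumPy >= 1.15 for np.take_along_axis and the (Nm, iterations, chains) array shapes returned by pyjags.
In [ ]:
## General label-switching fix: sort components by e_sigma at each
## (iteration, chain) and reorder f identically. Assumes NumPy >= 1.15.
order = np.argsort(samples2['e_sigma'][:, start:, :], axis=0)
esigma_sorted = np.take_along_axis(samples2['e_sigma'][:, start:, :], order, axis=0)
f_sorted = np.take_along_axis(samples2['f'][:, start:, :], order, axis=0)
## Component 0 is now the low-dispersion component everywhere:
print(np.allclose(esigma_sorted[0], esigma_low))
print(np.allclose(f_sorted[0], f_low))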
In [ ]:
#print(samples2)
#print(samples2.items())
print(samples2['h'].shape)
print(samples2['k'][0,start:,:].shape)
print('-----')
samples_Nm2 = {}
Nsamp = 50
thin = 1
numHyperParams = 4
dim = (Nsamp/thin)*2 + numHyperParams
print(dim)
for i in np.arange(0, Nsamp, thin):
    samples_Nm2.update({'$h_{'+str(i+1)+'}$': samples2['h'][i,start:,:],
                        '$k_{'+str(i+1)+'}$': samples2['k'][i,start:,:]})
samples_Nm2.update({r'$e_{\sigma_{low}}$': esigma_low, r'$e_{\sigma_{high}}$': esigma_hi})
samples_Nm2.update({'$f_{low}$': f_low, '$f_{high}$': f_hi})
for j, i in samples_Nm2.items():
    print(j)
In [ ]:
## Equal-tailed 95% credible intervals and posterior distribution means:
def summary(samples, varname, p=95):
    values = samples[varname]
    ci = np.percentile(values, [50 - p/2., 50 + p/2.])
    print('{:<6} mean = {:>5.1f}, {}% credible interval [{:>4.1f} {:>4.1f}]'.format(
        varname, np.mean(values), p, *ci))

for varname in samples_Nm2:
    summary(samples_Nm2, varname)
In [ ]:
## Use a pandas three-dimensional Panel to represent the trace:
trace = pd.Panel(samples_Nm2)
trace.axes[0].name = 'Variable'
trace.axes[1].name = 'Iteration'
trace.axes[2].name = 'Chain'
## Point estimates:
print(trace.to_frame().mean())
## Bayesian equal-tailed 95% credible intervals:
print(trace.to_frame().quantile([0.025, 0.975]))
## ^ entering the values here could be a good question part
def plot(trace, var):
    fig, axes = plt.subplots(1, 3, figsize=(9, 3))
    fig.suptitle(var, y=0.95, fontsize='xx-large')
    ## Marginal posterior density estimate:
    trace[var].plot.density(ax=axes[0])
    axes[0].set_xlabel('Parameter value')
    axes[0].locator_params(tight=True)
    ## Autocorrelation for each chain:
    axes[1].set_xlim(0, 100)
    for chain in trace[var].columns:
        autocorrelation_plot(trace[var, :, chain], axes[1], label=chain)
    ## Trace plot:
    axes[2].set_ylabel('Parameter value')
    trace[var].plot(ax=axes[2])
    ## Save figure, stripping LaTeX markup from the variable name first:
    filename = var.replace("\\", "")
    filename = filename.replace("/", "")
    filename = filename.replace("$", "")
    filename = filename.replace("}", "")
    filename = filename.replace("{", "")
    plt.tight_layout(pad=3)
    fig.savefig('Nm2_JAGS_'+'{}.png'.format(filename))

## Display diagnostic plots:
for var in trace:
    plot(trace, var)
In [ ]:
## Scatter matrix plot:
## Redefine the trace so that we only visualize every 10th latent-variable element in
## the scatter_matrix plot below; visualizing all 50 is too cumbersome for the scatter
## matrix.
samples_Nm2_for_scatter_matrix = {}
## Adjust the thin variable to only look at every 10th population element by setting it to 10.
thin = 10
numHyperParams = 4
dim = (Ndata/thin)*2 + numHyperParams
print(dim)
for i in np.arange(0, Ndata, thin):
    samples_Nm2_for_scatter_matrix.update({'$h_{'+str(i+1)+'}$': samples2['h'][i,start:,:],
                                           '$k_{'+str(i+1)+'}$': samples2['k'][i,start:,:]})
samples_Nm2_for_scatter_matrix.update({r'$e_{\sigma_{low}}$': esigma_low, r'$e_{\sigma_{high}}$': esigma_hi})
samples_Nm2_for_scatter_matrix.update({'$f_{low}$': f_low, '$f_{high}$': f_hi})
for j, i in samples_Nm2_for_scatter_matrix.items():
    print(j)
trace_3 = pd.Panel(samples_Nm2_for_scatter_matrix)
sm = scatter_matrix(trace_3.to_frame(), color="darkturquoise", alpha=0.2,
                    figsize=(dim*2, dim*2), diagonal='hist',
                    hist_kwds={'bins': 25, 'histtype': 'step', 'edgecolor': 'r', 'linewidth': 2})
## y labels size
[plt.setp(item.yaxis.get_label(), 'size', 20) for item in sm.ravel()]
## x labels size
[plt.setp(item.xaxis.get_label(), 'size', 20) for item in sm.ravel()]
## Change label rotation
## This is helpful for very long labels
#[s.xaxis.label.set_rotation(45) for s in sm.reshape(-1)]
[s.xaxis.label.set_rotation(0) for s in sm.reshape(-1)]
[s.yaxis.label.set_rotation(0) for s in sm.reshape(-1)]
## May need to offset label when rotating to prevent overlap of figure
[s.get_yaxis().set_label_coords(-0.5,0.5) for s in sm.reshape(-1)]
## Hide all ticks
#[s.set_xticks(()) for s in sm.reshape(-1)]
#[s.set_yticks(()) for s in sm.reshape(-1)]
plt.savefig('scatter_matrix_Nm2_JAGS.png')
In [ ]: